In this notebook, I look for characteristics that distinguish MedHelp posts written by medical experts from those written by the general public.
In [1]:
# Set up paths: keep the notebook directory on sys.path (so local modules
# such as lognormal.py stay importable), then work from ../data
import os
import sys

this_path = os.getcwd()
os.chdir("../data")
sys.path.insert(0, this_path)
In [2]:
# Load datasets
import pandas as pd

df = pd.read_csv("MedHelp-posts.csv", index_col=0)
df.head(2)
Out[2]:
In [3]:
df_users = pd.read_csv("MedHelp-users.csv", index_col=0)
df_users.head(2)
Out[3]:
In [4]:
# 1. Classify users as professionals vs. general public.
# Heuristic: a comma in the user description (presumably a name followed
# by credentials, e.g. "Jane Doe, MD") flags the user as an expert.
df_users['is expert'] = 0
for user_id in df_users.index:
    user_description = df_users.loc[user_id, 'user description']
    if "," in user_description:
        print(user_description)
        df_users.loc[user_id, 'is expert'] = 1

# Save the labelled table:
df_users.to_csv("MedHelp-users-class.csv")
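The loop above can also be written without iterating row by row; a vectorized sketch, equivalent under the same comma heuristic:
In [ ]:
# Vectorized version of the comma heuristic (na=False guards against missing descriptions)
df_users['is expert'] = df_users['user description'].str.contains(',', regex=False, na=False).astype(int)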
In [5]:
is_expert = df_users['is expert'] == 1
is_expert.value_counts()
Out[5]:
Only 10 of the 505 users are experts! That is about 2% of all users.
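The share can also be read off directly by normalizing the counts; a one-line check:
In [ ]:
# Fraction of experts vs. non-experts
df_users['is expert'].value_counts(normalize=True)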
In [6]:
# Select the user ids where 'is expert' == 1, and the complement
experts_ids = df_users[df_users['is expert'] == 1].index.values
non_experts_ids = df_users[df_users['is expert'] == 0].index.values
In [7]:
# Select all posts whose author is one of the expert users
df_experts = df.loc[df['user id'].isin(experts_ids)]
print('Total posts from expert users: {}'.format(len(df_experts)))
print('Total posts: {}'.format(len(df)))
print('Ratio: {}'.format(len(df_experts) / len(df)))
del df_experts
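The same ratio is available in one line from a boolean mask; a minimal equivalent sketch:
In [ ]:
# Fraction of all posts authored by experts
df['user id'].isin(experts_ids).mean()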
In [8]:
# Tokenize the post text
import nltk

tokenizer = nltk.RegexpTokenizer(r'\w+')
# Lower-case the text and store each post's token count in a new column
df_text = df['text'].str.lower()
df_token = df_text.apply(tokenizer.tokenize)
df['token length'] = df_token.apply(len)
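As a sanity check, the regexp tokenizer keeps runs of word characters and drops punctuation; a made-up example:
In [ ]:
# Hypothetical input, just to illustrate the tokenization
tokenizer.tokenize("i have a headache, should i see a doctor?")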
In [9]:
# Plotting and fitting imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # older seaborn restyles matplotlib figures on import
from matplotlib import gridspec
from scipy.optimize import curve_fit
from lognormal import lognormal, lognormal_stats, truncated_normal
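The lognormal module is local to this project and its source is not shown here. For reference, a minimal sketch of what it presumably provides, inferred from how the functions are called below (two free parameters each; the actual implementation may differ):
In [ ]:
# Assumed sketch of the local lognormal.py, not the actual source
import numpy as np
from scipy.stats import norm

def lognormal(x, mu, sigma):
    # log-normal pdf; mu and sigma are the mean and std of log(x)
    return np.exp(-(np.log(x) - mu) ** 2 / (2.0 * sigma ** 2)) \
        / (x * sigma * np.sqrt(2.0 * np.pi))

def lognormal_stats(mu, sigma):
    # mean and variance of the log-normal distribution
    mean = np.exp(mu + sigma ** 2 / 2.0)
    var = (np.exp(sigma ** 2) - 1.0) * np.exp(2.0 * mu + sigma ** 2)
    return mean, var

def truncated_normal(x, mu, sigma):
    # normal pdf truncated at x = 0 (token counts are non-negative),
    # renormalized so it still integrates to one
    return norm.pdf(x, mu, sigma) / (1.0 - norm.cdf(0.0, mu, sigma))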
In [10]:
# Figure set-up: two stacked panels with a shared vertical label
plt.rcParams['text.usetex'] = True
plt.rcParams.update({'font.size': 24})
nbins = 100
fig = plt.figure()
gs = gridspec.GridSpec(2, 1)
fig.text(0.04, 0.5, 'Distribution', va='center', rotation='vertical')
Out[10]:
In [12]:
# X ticks shared by both panels
xmax = 1000
x = np.arange(0, xmax, 100)  # tick positions
xx = np.arange(1, xmax, 1)   # abscissae for the fitted curves

# Panel 1: non-expert posts, fitted with a truncated normal
ax1 = plt.subplot(gs[0])
ax1.set_xlim([0, xmax])
ax1.set_xticks(x)
ax1.tick_params(labelbottom=False)
X = df.loc[df['user id'].isin(non_experts_ids)]['token length'].values
n, bins, patches = plt.hist(X, nbins, density=True, facecolor='cyan', align='mid')
# Fit against the left bin edges and the bar heights
popt, pcov = curve_fit(truncated_normal, bins[:nbins], n)
c0, = plt.plot(xx, truncated_normal(xx, *popt), color='blue', label='non expert')
plt.legend(handles=[c0], bbox_to_anchor=(0.45, 0.95), loc=2, borderaxespad=0.)
print(popt)
mu = X.mean()
var = X.var()
print("Class 0: Mean, variance: ({}, {})".format(mu, var))
# Panel 2: expert posts, fitted with a log-normal
ax2 = plt.subplot(gs[1])
ax2.set_xlim([0, xmax])
ax2.set_xticks(x)
X = df.loc[df['user id'].isin(experts_ids)]['token length'].values
n, bins, patches = plt.hist(X, nbins, density=True, facecolor='orange', align='mid')
popt, pcov = curve_fit(lognormal, bins[:nbins], n)
c1, = plt.plot(xx, lognormal(xx, *popt), color='red', label='expert')
plt.legend(handles=[c1], bbox_to_anchor=(0.45, 0.95), loc=2, borderaxespad=0.)
print("Class 1: Mean, variance:", lognormal_stats(*popt))
plt.show()
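Fitting the candidate pdfs to histogram bar heights ties the result to the binning. A binning-free alternative is a maximum-likelihood fit on the raw token counts; a sketch with scipy.stats.lognorm (not what the code above does):
In [ ]:
# Alternative: maximum-likelihood log-normal fit on the raw counts
# (assumes strictly positive token counts)
from scipy.stats import lognorm

X = df.loc[df['user id'].isin(experts_ids)]['token length'].values
shape, loc, scale = lognorm.fit(X, floc=0)  # floc=0 pins the lower bound at zero
print("MLE mean, variance:", lognorm.mean(shape, loc, scale), lognorm.var(shape, loc, scale))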